#!/bin/bash
# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.

# Trace every command and abort on the first unhandled failure.
set -ex

# Positional arguments: $1 is handed to mpirun as --hostfile, $2 is multiplied
# by the local GPU count, $3 is compared against "True" to enable IPv6 mode.
NUM_HOSTS_FILE=$1
NUM_HOSTS=$2
IS_IPV6=$3

# CUDA_HOME is used later to build the LD_LIBRARY_PATH exported to the ranks.
[[ -n "${CUDA_HOME}" ]] || {
    echo "CUDA_HOME variable is empty, please define it in dockerfile"
    exit 1
}

# Request a metadata session token, then use it to look up the instance type.
TOKEN=$(curl -X PUT -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" "http://169.254.169.254/latest/api/token")
INSTANCE_TYPE=$(curl -v -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-type)

# GPU_COUNT is the number of lines printed by `nvidia-smi -L`;
# NODES is that count multiplied by the number of hosts.
GPU_COUNT=$(nvidia-smi -L | wc -l)
NODES=$(( ${GPU_COUNT} * ${NUM_HOSTS} ))


PRETTY_NAME=$(grep PRETTY_NAME < /etc/os-release)
TRAINING_LOG="/test/v2/logs/testEFA.log"

# https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html
# g5.24xlarge we use in RC is not RDMA read supported, so the device RDMA flag
# is only added for the instance types listed below.
USE_DEVICE_RDMA_ARG=""
case "${INSTANCE_TYPE}" in
  p4d.24xlarge | p4de.24xlarge | p5.48xlarge)
    USE_DEVICE_RDMA_ARG="-x FI_EFA_USE_DEVICE_RDMA=1"
    ;;
esac

# Verifies that the captured all_reduce_perf log contains the markers expected
# from a run that went through aws-ofi-nccl and the EFA provider.
# Globals:
#   TRAINING_LOG  (read) path of the log to inspect
#   INSTANCE_TYPE (read) instance types matching p4d* or p5* get two extra checks
# Outputs: the matching log lines (printed by grep) and, on failure, a
#          diagnostic message on stdout.
# Exits with status 1 at the first marker that is missing.
validate_all_reduce_performance_logs(){
    grep "aws-ofi-nccl" "${TRAINING_LOG}" || { echo "aws-ofi-nccl is not working, please check if it is installed correctly"; exit 1; }
    grep -i "NET/OFI Selected [Pp]rovider is efa" "${TRAINING_LOG}" || { echo "efa is not working, please check if it is installed correctly"; exit 1; }
    # EFA 1.37.0 using "Using network Libfabric" instead of "Using network AWS Libfabric"
    grep -E "Using network (AWS )?Libfabric" "${TRAINING_LOG}" || { echo "efa is not working, please check if it is installed correctly"; exit 1; }
    if [[ ${INSTANCE_TYPE} == p4d* || ${INSTANCE_TYPE} == p5* ]]; then
        # These two checks fail explicitly, like the ones above, instead of
        # depending on the caller having `set -e` enabled.
        grep "NCCL_TOPO_FILE set by environment to" "${TRAINING_LOG}" || { echo "NCCL_TOPO_FILE was not picked up from the environment, please check the NCCL topology file configuration"; exit 1; }
        # EFA 1.37.0 change from NET/AWS Libfabric/0/GDRDMA to NET/Libfabric/0/GDRDMA
        grep -E "NET/(AWS )?Libfabric/0/GDRDMA" "${TRAINING_LOG}" || { echo "GDRDMA transport was not found in the log, please check the EFA device RDMA configuration"; exit 1; }
    fi
}

# Checks the value all_reduce_perf reported for the 1073741824-byte message
# size against a minimum threshold.
# Globals:
#   TRAINING_LOG          (read)    log captured from the benchmark run
#   benchmark             (written) field 11 of the last log line containing
#                                   1073741824 (presumably a bandwidth column of
#                                   the result row; confirm against the
#                                   all_reduce_perf column layout)
#   PERFORMANCE_THRESHOLD (written) minimum accepted value
# Outputs: the extracted value and a pass/fail banner on stdout.
# Exits with status 1 when the value is missing, not a number, or below the
# threshold.
check_efa_nccl_all_reduce_performance(){
    # awk splits on runs of whitespace, so the field needs no further trimming.
    benchmark=$(grep '1073741824' "${TRAINING_LOG}" | tail -n1 | awk '{print $11}')
    echo "Benchmark throughput: ${benchmark}"
    if [[ -z "${benchmark}" ]]; then
        echo "benchmark variable is empty"
        exit 1
    fi

    # A non-numeric field would make awk fall back to string comparison, where
    # e.g. "N/A" sorts after "3" and the check would pass by accident.
    local numeric_re='^[+-]?([0-9]+([.][0-9]*)?|[.][0-9]+)([eE][+-]?[0-9]+)?$'
    if [[ ! "${benchmark}" =~ ${numeric_re} ]]; then
        echo "benchmark value '${benchmark}' is not numeric"
        exit 1
    fi

    # The standard throughput should be at least 41 for 2 p4d with 4 EFA devices
    # However, if the 2 instances are not in the same A-Z in the same region, performance can decrease.
    # To account for this we need to modify thresholds dynamically based on where instances are.
    # Temporarily setting these to be < 50% of optimal until AWS OFI NCCL team has concrete numbers for this.
    PERFORMANCE_THRESHOLD="3"

    # "+ 0" forces a numeric comparison; awk exits 0 when the threshold is met.
    if awk -v value="${benchmark}" -v threshold="${PERFORMANCE_THRESHOLD}" \
        'BEGIN { exit !(value + 0 >= threshold + 0) }'; then
        echo "***************************** check_efa_nccl_all_reduce_performance passed *****************************"
    else
        echo "***************************** check_efa_nccl_all_reduce_performance failed *****************************"
        exit 1
    fi
}

# Runs the all_reduce_perf benchmark through mpirun with the EFA provider,
# writes the combined stdout/stderr to TRAINING_LOG, then validates the log
# markers and the reported throughput.
# Globals:
#   IS_IPV6             (read)    "True" adds -x NCCL_SOCKET_FAMILY=AF_INET6
#   NODES, GPU_COUNT    (read)    passed to mpirun as -n and -N
#   NUM_HOSTS_FILE      (read)    passed to mpirun as --hostfile
#   USE_DEVICE_RDMA_ARG (read)    string of zero or more extra mpirun words
#   CUDA_HOME, LD_LIBRARY_PATH (read) used to build the exported LD_LIBRARY_PATH
#   TRAINING_LOG        (read)    file that tee writes the run output to
#   RETURN_VAL          (written) exit status of mpirun
# Outputs: the benchmark output and pass/fail banners on stdout.
check_efa_nccl_all_reduce(){
    echo "Running all_reduce_perf test"

    # USE_DEVICE_RDMA_ARG is a plain string; split it into words on purpose so
    # it can be spliced into the argument array below.
    local -a rdma_args=()
    read -r -a rdma_args <<< "${USE_DEVICE_RDMA_ARG}"

    # The IPv4 and IPv6 invocations differ only by this optional flag.
    local -a socket_family_args=()
    if [[ ${IS_IPV6} == "True" ]]; then
        echo "Running all_reduce_perf test with IPv6: using IPv6 mode with NCCL_SOCKET_FAMILY=AF_INET6"
        socket_family_args=(-x NCCL_SOCKET_FAMILY=AF_INET6)
    else
        echo "Running all_reduce_perf test with IPv4: using default IPv4 mode"
    fi

    # Need to pass -x PATH because rank non-zero nodes seem to "forget" the value of PATH that is pre-configured into
    # the container. Not using full-paths of mpirun and other executables because these paths can change across PyTorch
    # versions in DLC images.
    local -a mpirun_cmd=(
        mpirun -x FI_PROVIDER=efa -n "${NODES}" -N "${GPU_COUNT}" --hostfile "${NUM_HOSTS_FILE}"
        -x NCCL_DEBUG=INFO "${rdma_args[@]}" -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1
        -x PATH -x "LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
        -x 'NCCL_SOCKET_IFNAME=^lo' --mca pml '^cm' --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none
        "${socket_family_args[@]}"
        /all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100
    )

    "${mpirun_cmd[@]}" 2>&1 | tee "${TRAINING_LOG}"
    # Read PIPESTATUS right away: index 0 is mpirun, index 1 is tee.
    RETURN_VAL=${PIPESTATUS[0]}
    # In case, if you would like see logs, uncomment below line
    # RESULT=$(cat ${TRAINING_LOG})

    # NOTE(review): a non-zero mpirun status is only reported here; the final
    # verdict comes from the log and throughput checks below. Confirm whether
    # the status should also fail the test.
    if [[ "${RETURN_VAL}" -eq 0 ]]; then
        echo "***************************** check_efa_nccl_all_reduce passed *****************************"
    else
        echo "***************************** check_efa_nccl_all_reduce failed *****************************"
    fi
    validate_all_reduce_performance_logs
    check_efa_nccl_all_reduce_performance
}

# Entry point: run the all_reduce_perf benchmark, then validate its log and throughput.
check_efa_nccl_all_reduce
